/*
 * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

/*
    Recursive Gaussian filter
*/

#ifndef _RECURSIVEGAUSSIAN_KERNEL_CU_
#define _RECURSIVEGAUSSIAN_KERNEL_CU_

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <cutil_inline.h>    // includes cuda.h and cuda_runtime_api.h
#include <cutil_math.h>

#define BLOCK_DIM 16
#define CLAMP_TO_EDGE 1

// Transpose kernel (see transpose SDK sample for details)
__global__ void d_transpose(uint *odata, uint *idata, int width, int height)
{
    __shared__ uint block[BLOCK_DIM][BLOCK_DIM+1];
    
    // read the matrix tile into shared memory
    unsigned int xIndex = blockIdx.x * BLOCK_DIM + threadIdx.x;
    unsigned int yIndex = blockIdx.y * BLOCK_DIM + threadIdx.y;
    if((xIndex < width) && (yIndex < height))
    {
        unsigned int index_in = yIndex * width + xIndex;
        block[threadIdx.y][threadIdx.x] = idata[index_in];
    }

    __syncthreads();

    // write the transposed matrix tile to global memory
    xIndex = blockIdx.y * BLOCK_DIM + threadIdx.x;
    yIndex = blockIdx.x * BLOCK_DIM + threadIdx.y;
    if((xIndex < height) && (yIndex < width))
    {
        unsigned int index_out = yIndex * height + xIndex;
        odata[index_out] = block[threadIdx.x][threadIdx.y];
    }
}

// RGBA version
// reads from 32-bit uint array holding 8-bit RGBA

// convert floating point rgba color to 32-bit integer
__device__ uint rgbaFloatToInt(float4 rgba)
{
    rgba.x = __saturatef(rgba.x);   // clamp to [0.0, 1.0]
    rgba.y = __saturatef(rgba.y);
    rgba.z = __saturatef(rgba.z);
    rgba.w = __saturatef(rgba.w);
    return (uint(rgba.w*255)<<24) | (uint(rgba.z*255)<<16) | (uint(rgba.y*255)<<8) | uint(rgba.x*255);
}

// convert from 32-bit int to float4
__device__ float4 rgbaIntToFloat(uint c)
{
    float4 rgba;
    rgba.x = (c & 0xff) / 255.0f;
    rgba.y = ((c>>8) & 0xff) / 255.0f;
    rgba.z = ((c>>16) & 0xff) / 255.0f;
    rgba.w = ((c>>24) & 0xff) / 255.0f;
    return rgba;
}

/*
	simple 1st order recursive filter
	- processes one image column per thread

	parameters:	
	id - pointer to input data (RGBA image packed into 32-bit integers)
	od - pointer to output data 
	w  - image width
	h  - image height
	a  - blur parameter
*/

__global__ void
d_simpleRecursive_rgba(uint *id, uint *od, int w, int h, float a)
{
    unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
	if (x >= w) return;
    
    id += x;    // advance pointers to correct column
    od += x;

    // forward pass
    float4 yp = rgbaIntToFloat(*id);  // previous output
    for (int y = 0; y < h; y++) {
        float4 xc = rgbaIntToFloat(*id);
        float4 yc = xc + a*(yp - xc);   // simple lerp between current and previous value
		*od = rgbaFloatToInt(yc);
        id += w; od += w;    // move to next row
        yp = yc;
    }

    // reset pointers to point to last element in column
    id -= w;
    od -= w;

    // reverse pass
    // ensures response is symmetrical
    yp = rgbaIntToFloat(*id);
    for (int y = h-1; y >= 0; y--) {
        float4 xc = rgbaIntToFloat(*id);
        float4 yc = xc + a*(yp - xc);
		*od = rgbaFloatToInt((rgbaIntToFloat(*od) + yc)*0.5f);
        id -= w; od -= w;  // move to previous row
        yp = yc;
    }
}

/*
	recursive Gaussian filter

	parameters:	
	id - pointer to input data (RGBA image packed into 32-bit integers)
	od - pointer to output data 
	w  - image width
	h  - image height
	a0-a3, b1, b2, coefp, coefn - filter parameters
*/

__global__ void
d_recursiveGaussian_rgba(uint *id, uint *od, int w, int h, float a0, float a1, float a2, float a3, float b1, float b2, float coefp, float coefn)
{
    unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
	if (x >= w) return;
	
    id += x;    // advance pointers to correct column
    od += x;

    // forward pass
    float4 xp = make_float4(0.0f);  // previous input
    float4 yp = make_float4(0.0f);  // previous output
    float4 yb = make_float4(0.0f);  // previous output by 2
#if CLAMP_TO_EDGE
    xp = rgbaIntToFloat(*id); yb = coefp*xp; yp = yb;
#endif
    for (int y = 0; y < h; y++) {
        float4 xc = rgbaIntToFloat(*id);
        float4 yc = a0*xc + a1*xp - b1*yp - b2*yb;
		*od = rgbaFloatToInt(yc);
        id += w; od += w;    // move to next row
        xp = xc; yb = yp; yp = yc; 
    }

    // reset pointers to point to last element in column
    id -= w;
    od -= w;

    // reverse pass
    // ensures response is symmetrical
    float4 xn = make_float4(0.0f);
    float4 xa = make_float4(0.0f);
    float4 yn = make_float4(0.0f);
    float4 ya = make_float4(0.0f);
#if CLAMP_TO_EDGE
    xn = xa = rgbaIntToFloat(*id); yn = coefn*xn; ya = yn;
#endif
    for (int y = h-1; y >= 0; y--) {
        float4 xc = rgbaIntToFloat(*id);
        float4 yc = a2*xn + a3*xa - b1*yn - b2*ya;
        xa = xn; xn = xc; ya = yn; yn = yc;
		*od = rgbaFloatToInt(rgbaIntToFloat(*od) + yc);
        id -= w; od -= w;  // move to previous row
    }
}

#endif // #ifndef _GAUSSIAN_KERNEL_H_
